def get_paper(link):
    """Return the third "/"-separated segment of *link*.

    For a link shaped like "scheme://host/path", splitting on "/" yields
    ["scheme:", "", "host", ...], so the segment at index 2 is the host
    part, which the rest of this script uses as the paper identifier.

    Raises IndexError when *link* contains fewer than two "/" characters.
    """
    # Only the first three separators matter for locating index 2.
    segments = link.split("/", 3)
    return segments[2]

import pickle
import datetime

# Split every story cluster into per-paper sub-clusters and write one CSV row
# (";"-separated) per sub-cluster that contains more than one article.

#input file
# NOTE: pickle.load can execute arbitrary code embedded in the file; only run
# this against a pickle that was produced locally and is trusted.
# The input is loaded before the output is opened so that a missing or corrupt
# pickle does not truncate a previously written CSV.
with open("communities (2).pickle", "rb") as pickle_in:
    obj = pickle.load(pickle_in)

print("Processing")

#output file
with open("NER - Story Dataset - Single Paper.csv", "w") as clust_out:
    clust_out.write(";".join(
            ("story", "num_articles", "duration", "other_papers?")))
    clust_out.write("\n")

    #each item in this dictionary is a story cluster
    #may need to be divided into multiple, single paper story clusters
    for cluster in obj:
        stories = obj[cluster]

        #per paper: the articles it contributed and their earliest/latest time
        papers = {}
        for story in stories:
            paper = get_paper(story)
            published = stories[story]["time"]

            if paper not in papers:
                #seed both bounds with the first article's own time instead of
                #fixed sentinel dates, so articles from any year are handled
                papers[paper] = {
                    "storylist": [],
                    "start_date": published,
                    "end_date": published,
                }

            entry = papers[paper]
            entry["storylist"].append(story)
            if published < entry["start_date"]:
                entry["start_date"] = published
            if published > entry["end_date"]:
                entry["end_date"] = published

        #number of distinct papers in this cluster (includes the paper of the
        #row being written); goes into the "other_papers?" column
        num_papers = len(papers)

        for paper in papers:
            entry = papers[paper]
            num_articles = len(entry["storylist"])

            #ignore single story clusters
            if num_articles == 1:
                continue

            #turn days and seconds into hours (microseconds are not counted)
            span = entry["end_date"] - entry["start_date"]
            duration = span.days * 24 + (span.seconds / 60) / 60

            clust_out.write(";".join(
                (cluster, str(num_articles), str(duration), str(num_papers))))
            clust_out.write("\n")

print("Done")

            
            

    

